03-148. AI를 활용한 비정형데이터 분석

AI를 활용한 비정형데이터 분석

최근 생성형 AI의 발전으로 비정형데이터 분석을 더욱 효율적으로 수행할 수 있게 되었다.

LLM을 활용한 텍스트 분석

def analyze_text_with_llm(text_data):
    """
    Analyze texts with an LLM (pseudocode stand-in).

    No model is called here: every text receives a keyword-based sentiment
    and fixed placeholder fields that mimic the structured answer an LLM
    would return. A real implementation needs an OpenAI API key.

    Args:
        text_data: Iterable of strings to analyze.

    Returns:
        pandas.DataFrame with one row per input text and the columns
        original_text, sentiment, keywords, category, urgency and summary.
    """

    def _placeholder_result(text):
        # A real implementation would call the LLM API at this point,
        # e.g. openai.ChatCompletion.create(...).
        looks_positive = any(marker in text for marker in ('좋', '만족'))
        return {
            'original_text': text,
            'sentiment': 'positive' if looks_positive else 'negative',
            'keywords': ['품질', '배송', '서비스'],
            'category': '제품리뷰',
            'urgency': 'low',
            # First 30 characters followed by an ellipsis marker.
            'summary': text[:30] + '...',
        }

    return pd.DataFrame([_placeholder_result(text) for text in text_data])

# Usage example: run the placeholder analysis on three sample reviews.
sample_texts = [
    "제품 품질이 정말 좋네요! 배송도 빠르고 만족합니다.",
    "서비스가 별로예요. 고객센터 응답이 너무 늦어요.",
    "가격 대비 괜찮은 것 같아요. 디자인이 예뻐요.",
]
llm_results = analyze_text_with_llm(sample_texts)

# Heading and table are printed on separate lines.
print("LLM 분석 결과:", llm_results, sep="\n")

AI가 텍스트, 이미지 등 다양한 형식의 파일을 함께 입력받아 분석하도록 할 수도 있다. 이를 멀티모달 분석(multimodal analysis)이라고 한다.

def multimodal_analysis(text_data, image_data):
    """Analyze paired texts and image features together.

    Each text is scored with ``analyze_sentiment`` and each image is
    labelled 'bright' when its '평균_밝기' value is above 128, otherwise
    'dark'. A pair is 'consistent' when a positive text goes with a bright
    image or a negative text goes with a dark one.

    Args:
        text_data: Iterable of texts.
        image_data: Iterable of mappings that contain a '평균_밝기' entry.
            Items are paired with ``text_data`` by position via ``zip``,
            so extra items in the longer input are not analyzed.

    Returns:
        pandas.DataFrame with the columns id (1-based), text,
        text_sentiment, image_mood and consistency.
    """
    # Sentiment/mood combinations that agree with each other.
    agreeing_pairs = (('positive', 'bright'), ('negative', 'dark'))

    rows = []
    for row_id, (text, img_features) in enumerate(zip(text_data, image_data), start=1):
        text_sentiment = analyze_sentiment(text)

        # Simple brightness-based image analysis.
        if img_features['평균_밝기'] > 128:
            img_mood = 'bright'
        else:
            img_mood = 'dark'

        if (text_sentiment, img_mood) in agreeing_pairs:
            consistency = 'consistent'
        else:
            consistency = 'inconsistent'

        rows.append({
            'id': row_id,
            'text': text,
            'text_sentiment': text_sentiment,
            'image_mood': img_mood,
            'consistency': consistency,
        })

    return pd.DataFrame(rows)

# Example data: every review text is paired with the features of one image.
sample_img_features = [
    {'평균_밝기': 180},  # bright image
    {'평균_밝기': 80},   # dark image
]
sample_texts = ["정말 좋은 제품이에요!", "별로 마음에 안 들어요"]

multimodal_results = multimodal_analysis(sample_texts, sample_img_features)

# Heading and table are printed on separate lines.
print("멀티모달 분석 결과:", multimodal_results, sep="\n")

실무 활용 방안

1. 고객 피드백 분석 시스템

class CustomerFeedbackAnalyzer:
    """Collect customer feedback from several sources and summarize it.

    Attributes are plain instance state:
      - feedback_data: list of collected feedback dicts, in arrival order.
      - analysis_results: dict produced by the latest analyze_feedback()
        call; empty until an analysis has run or after new feedback
        arrives.
    """

    # Returned by analyze_feedback() and generate_report() when no feedback
    # has been collected yet.
    NO_DATA_MESSAGE = "분석할 데이터가 없습니다."

    def __init__(self):
        self.feedback_data = []
        self.analysis_results = {}

    def collect_feedback(self, source, content, metadata=None):
        """Store one piece of feedback, stamped with the current time.

        Args:
            source: Origin label such as 'review', 'social_media', 'survey'.
            content: Feedback text.
            metadata: Optional dict of extra fields; stored as {} when
                omitted or empty.
        """
        feedback = {
            'timestamp': datetime.now(),
            'source': source,
            'content': content,
            'metadata': metadata or {},
        }
        self.feedback_data.append(feedback)
        # Cached results no longer cover every entry, so drop them and let
        # the next report trigger a fresh analysis.
        self.analysis_results = {}

    def analyze_feedback(self):
        """Run the full analysis over all collected feedback.

        Returns:
            NO_DATA_MESSAGE (a string) when nothing has been collected.
            Otherwise a dict, also stored in ``self.analysis_results``,
            with the keys:
              - total_feedback: number of feedback entries.
              - sentiment_distribution: count per sentiment label.
              - top_keywords: the ten most frequent whitespace-separated
                tokens and their counts.
              - source_analysis: sentiment counts grouped by source.
              - recent_feedback: source/content/sentiment records of the
                last five entries.
        """
        if not self.feedback_data:
            return self.NO_DATA_MESSAGE

        df = pd.DataFrame(self.feedback_data)

        # Sentiment per feedback entry.
        df['sentiment'] = df['content'].apply(analyze_sentiment)

        # Keyword extraction: token frequency over all feedback text.
        all_content = ' '.join(df['content'])
        word_freq = Counter(all_content.split())

        # Sentiment counts broken down by source.
        source_analysis = df.groupby('source')['sentiment'].value_counts()

        self.analysis_results = {
            'total_feedback': len(df),
            'sentiment_distribution': df['sentiment'].value_counts().to_dict(),
            'top_keywords': dict(word_freq.most_common(10)),
            'source_analysis': source_analysis.to_dict(),
            'recent_feedback': df.tail(5)[['source', 'content', 'sentiment']].to_dict('records'),
        }

        return self.analysis_results

    def generate_report(self):
        """Build a text report from the analysis results.

        Runs analyze_feedback() first when no results are cached.

        Returns:
            The report text, or NO_DATA_MESSAGE when there are neither
            cached results nor collected feedback (previously this case
            raised KeyError).
        """
        if not self.analysis_results:
            if not self.feedback_data:
                return self.NO_DATA_MESSAGE
            self.analyze_feedback()

        results = self.analysis_results
        total = results['total_feedback']

        lines = [
            "",
            "=== 고객 피드백 분석 리포트 ===",
            "",
            f"📊 전체 피드백 수: {total}건",
            "",
            "😊 감정 분포:",
        ]
        for sentiment, count in results['sentiment_distribution'].items():
            percentage = (count / total) * 100
            lines.append(f"  - {sentiment}: {count}건 ({percentage:.1f}%)")

        lines += ["", "🔍 주요 키워드:"]
        # top_keywords holds up to ten entries; the report shows the first five.
        for word, freq in list(results['top_keywords'].items())[:5]:
            lines.append(f"  - {word}: {freq}회")

        return "\n".join(lines) + "\n"

# Usage example
analyzer = CustomerFeedbackAnalyzer()

# Collect feedback from three different sources.
analyzer.collect_feedback(
    source='review',
    content='제품 품질이 정말 좋아요!',
    metadata={'rating': 5},
)
analyzer.collect_feedback(
    source='social_media',
    content='배송이 너무 늦어요 ㅠㅠ',
    metadata={'platform': 'twitter'},
)
analyzer.collect_feedback(
    source='survey',
    content='가격 대비 만족스럽습니다',
    metadata={'survey_id': 'S001'},
)

# Analyze and print the report.
report = analyzer.generate_report()
print(report)

2. 실시간 모니터링 시스템

import time
from datetime import datetime, timedelta

class RealTimeMonitor:
    """Track simple health metrics from log and feedback streams and raise alerts.

    Attributes:
      - error_threshold: an alert is raised once the error count is
        strictly greater than this value.
      - alerts: list of alert dicts created so far, oldest first.
      - metrics: dict with 'error_rate', 'response_time' and
        'user_sentiment'. Despite its name, 'error_rate' is a running
        count of error responses, not a ratio.
    """

    def __init__(self, error_threshold=5):
        """Create a monitor.

        Args:
            error_threshold: Error count that must be exceeded before a
                HIGH_ERROR_RATE alert is raised. Defaults to 5.
        """
        self.error_threshold = error_threshold
        self.alerts = []
        self.metrics = {
            'error_rate': 0,
            'response_time': 0,
            'user_sentiment': 'neutral'
        }

    def process_log_stream(self, log_entry):
        """Process one log line as it arrives.

        The line is parsed by ``parse_log_line`` (defined outside this
        class; presumably returns a mapping with a '상태코드' entry, or a
        falsy value when the line cannot be parsed — verify against that
        helper). Lines that parse to a falsy value are ignored.
        """
        parsed_log = parse_log_line(log_entry)
        if not parsed_log:
            return

        # Status codes of 400 and above count as errors.
        if parsed_log['상태코드'] >= 400:
            self.metrics['error_rate'] += 1

            # Alert only when the count goes above the threshold; every
            # further error after that raises another alert.
            if self.metrics['error_rate'] > self.error_threshold:
                self.create_alert('HIGH_ERROR_RATE',
                                  f"에러율 임계값 초과: {self.metrics['error_rate']}건")

    def process_feedback_stream(self, feedback):
        """Process one piece of feedback and alert when it is negative."""
        sentiment = analyze_sentiment(feedback)

        if sentiment == 'negative':
            # Only the first 50 characters of the feedback go into the message.
            self.create_alert('NEGATIVE_FEEDBACK',
                              f"부정적 피드백 감지: {feedback[:50]}...")

    def create_alert(self, alert_type, message):
        """Record an alert and print it.

        Severity is 'high' when ``alert_type`` contains 'HIGH', otherwise
        'medium'.
        """
        alert = {
            'timestamp': datetime.now(),
            'type': alert_type,
            'message': message,
            'severity': 'high' if 'HIGH' in alert_type else 'medium'
        }
        self.alerts.append(alert)
        print(f"🚨 ALERT: {alert['message']}")

    def get_dashboard_data(self):
        """Return the current metrics plus the alerts raised in the last hour.

        Returns:
            dict with 'current_metrics' (the live metrics dict, not a
            copy), 'recent_alerts' and 'alert_count'.
        """
        # Compute the cutoff once so every alert is compared with the same instant.
        cutoff = datetime.now() - timedelta(hours=1)
        recent_alerts = [alert for alert in self.alerts
                         if alert['timestamp'] > cutoff]

        return {
            'current_metrics': self.metrics,
            'recent_alerts': recent_alerts,
            'alert_count': len(recent_alerts)
        }

# Usage example: simulated real-time input.
sample_logs = [
    '192.168.1.1 - - [10/Oct/2023:14:00:01 +0900] "GET /api/data HTTP/1.1" 500 0',
    '192.168.1.2 - - [10/Oct/2023:14:00:02 +0900] "GET /api/data HTTP/1.1" 500 0',
]
sample_feedback = [
    "서버가 자꾸 오류나요. 너무 불편해요.",
    "사이트가 느려서 짜증나네요.",
]

monitor = RealTimeMonitor()

# Logs are fed in before feedback, so any alerts print in that order.
for log in sample_logs:
    monitor.process_log_stream(log)

for feedback in sample_feedback:
    monitor.process_feedback_stream(feedback)

# Inspect the dashboard data.
dashboard_data = monitor.get_dashboard_data()
print(
    "\n📊 대시보드 현황:",
    f"현재 메트릭: {dashboard_data['current_metrics']}",
    f"최근 알림 수: {dashboard_data['alert_count']}건",
    sep="\n",
)

비정형데이터 분석의 도전과제와 해결방안

주요 도전과제

1. 데이터 품질 문제
   - 노이즈가 많은 데이터
   - 불완전하거나 일관성 없는 데이터
   - 편향된 데이터
2. 처리 복잡성
   - 높은 계산 비용
   - 복잡한 전처리 과정
   - 실시간 처리의 어려움
3. 해석의 어려움
   - 주관적 해석 가능성
   - 컨텍스트 의존성
   - 문화적/언어적 차이

해결방안

class DataQualityChecker:
    """Data quality inspection tool for text data."""

    def __init__(self):
        self.quality_metrics = {}

    def check_text_quality(self, texts):
        """Check every text for basic quality problems.

        Three checks are applied to each text:
          - 'too_short': fewer than 10 characters after stripping whitespace.
          - 'too_many_special_chars': more than 30% of the characters are
            neither word characters nor whitespace.
          - 'repetitive_content': fewer than half of the
            whitespace-separated words are distinct.

        Empty and whitespace-only texts are handled: ratio checks that
        have nothing to measure are skipped instead of dividing by zero.

        Args:
            texts: Iterable of strings.

        Returns:
            List of dicts, one per text that has at least one issue, with
            'index' (position in ``texts``), 'text' (first 50 characters
            plus '...') and 'issues' (list of issue codes).
        """
        quality_issues = []

        for i, text in enumerate(texts):
            issues = []

            # Length check.
            if len(text.strip()) < 10:
                issues.append('too_short')

            # Special-character ratio; an empty text has no characters to rate.
            if text:
                special_char_ratio = len(re.findall(r'[^\w\s]', text)) / len(text)
                if special_char_ratio > 0.3:
                    issues.append('too_many_special_chars')

            # Repetition check; a text without any words cannot be repetitive.
            words = text.split()
            if words and len(set(words)) / len(words) < 0.5:
                issues.append('repetitive_content')

            if issues:
                quality_issues.append({
                    'index': i,
                    'text': text[:50] + '...',
                    'issues': issues
                })

        return quality_issues

    def suggest_improvements(self, quality_issues):
        """Turn detected issues into human-readable improvement suggestions.

        Args:
            quality_issues: List as returned by check_text_quality().

        Returns:
            List of suggestion strings, one per recognized issue code, in
            input order. Unrecognized issue codes produce no suggestion.
        """
        advice_by_problem = {
            'too_short': "더 상세한 내용 필요",
            'too_many_special_chars': "특수문자 정제 필요",
            'repetitive_content': "중복 내용 제거 필요",
        }

        suggestions = []
        for issue in quality_issues:
            for problem in issue['issues']:
                advice = advice_by_problem.get(problem)
                if advice is not None:
                    suggestions.append(f"텍스트 {issue['index']}: {advice}")

        return suggestions

# Usage example: one sample per issue type plus one clean text.
sample_texts = [
    "좋아요",  # too short
    "!@#$%^&*()!@#$%^&*()",  # too many special characters
    "좋아요 좋아요 좋아요 좋아요 좋아요",  # repetitive
    "이 제품은 정말 훌륭합니다. 품질도 좋고 가격도 합리적이에요.",  # normal
]

quality_checker = DataQualityChecker()
issues = quality_checker.check_text_quality(sample_texts)
suggestions = quality_checker.suggest_improvements(issues)

print("=== 데이터 품질 검사 결과 ===")
for suggestion in suggestions:
    print(f"💡 {suggestion}")

마무리

핵심 포인트

비정형데이터의 중요성: 전체 데이터의 80% 이상을 차지하며 귀중한 인사이트 제공
다양한 분석 기법: 텍스트 마이닝, 이미지 분석, 로그 분석 등 각 데이터 유형별 특화 기법
생성형 AI의 활용: LLM을 통한 효율적인 비정형데이터 처리 및 분석
실무 적용: 고객 피드백 분석, 실시간 모니터링 등 다양한 비즈니스 활용 사례

향후 전망

AI 기술 발전: 더욱 정교한 자연어처리 및 컴퓨터 비전 기술
실시간 처리: 스트리밍 데이터 처리 기술의 발전
멀티모달 분석: 텍스트, 이미지, 음성을 통합한 분석 기법
자동화: 전처리부터 인사이트 도출까지 자동화된 파이프라인

비정형데이터 분석은 현대 데이터 과학의 핵심 영역으로, 적절한 도구와 기법을 활용하면 정형데이터에서는 얻을 수 없는 깊이 있는 인사이트를 제공할 수 있다. 특히 생성형 AI의 발전으로 비정형데이터 분석이 더욱 접근하기 쉬워지고 있어, 앞으로 더 많은 분야에서 활용될 것으로 예상된다.

AI를 활용한 비정형데이터 분석

LLM을 활용한 텍스트 분석

멀티모달 분석 - Multi modal analysis

실무 활용 방안

1. 고객 피드백 분석 시스템

2. 실시간 모니터링 시스템

비정형데이터 분석의 도전과제와 해결방안

주요 도전과제

해결방안

마무리